In [1]:
# Toy corpus: four short sentences that share part of their vocabulary.
our_texts = [
    "The cat sat on the mat",
    "The cat saw the other cat on Sat while she sat",
    "Excellent Smithers! Use the saw on the cat in the magic trick with the dog",
    "Excellent magic saw Smithers on the dog",
]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features for the toy corpus.
# - stop_words='english' drops common English words ("the", "on", ...), which is
#   why they do not appear in the vocabulary.
# - min_df=0.1 is a *proportion* of documents; with only four texts every term
#   that occurs at all passes the threshold, so it filters nothing here.
vectorizer = TfidfVectorizer(min_df=0.1, stop_words='english', use_idf=True)
document_term_matrix = vectorizer.fit_transform(our_texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists, converting the returned array
# to a plain list so `vocab` keeps the type the old call produced.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

In [3]:
# Inspect the vocabulary learned by the vectorizer (stop words already removed).
vocab


Out[3]:
['cat',
 'dog',
 'excellent',
 'magic',
 'mat',
 'sat',
 'saw',
 'smithers',
 'trick',
 'use']

In [4]:
import pandas as pd

In [5]:
# Dense, labelled view of the TF-IDF weights: one row per document, one column per term.
pd.DataFrame(
    data=document_term_matrix.toarray(),
    columns=vocab,
)


Out[5]:
cat dog excellent magic mat sat saw smithers trick use
0 0.448100 0.000000 0.000000 0.000000 0.702035 0.553492 0.000000 0.000000 0.000000 0.000000
1 0.600223 0.000000 0.000000 0.000000 0.000000 0.741395 0.300111 0.000000 0.000000 0.000000
2 0.277223 0.342426 0.342426 0.342426 0.000000 0.000000 0.277223 0.342426 0.434323 0.434323
3 0.000000 0.463468 0.463468 0.463468 0.000000 0.000000 0.375218 0.463468 0.000000 0.000000

Similarity among documents


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Pairwise cosine similarity between the rows (documents) of the TF-IDF matrix.
similarity = cosine_similarity(X=document_term_matrix)

In [8]:
# Document-by-document similarity table; row and column labels are document positions.
pd.DataFrame(data=similarity)


Out[8]:
0 1 2 3
0 1.000000 0.679316 0.124224 0.000000
1 0.679316 1.000000 0.249593 0.112607
2 0.124224 0.249593 1.000000 0.738833
3 0.000000 0.112607 0.738833 1.000000

What if we want to understand which words are more similar in this context?

Similarity among words


In [9]:
# Transposing makes each row a term, so the result is term-by-term similarity.
# NOTE: this rebinds `similarity`, replacing the document-level matrix computed earlier.
similarity = cosine_similarity(X=document_term_matrix.transpose())

In [10]:
# Term-by-term similarity table, labelled on both axes with the vocabulary.
pd.DataFrame(
    data=similarity,
    index=vocab,
    columns=vocab,
)


Out[10]:
cat dog excellent magic mat sat saw smithers trick use
cat 1.000000 0.206256 0.206256 0.206256 0.561040 0.937830 0.580043 0.206256 0.347095 0.347095
dog 0.206256 1.000000 1.000000 1.000000 0.000000 0.000000 0.841010 1.000000 0.594236 0.594236
excellent 0.206256 1.000000 1.000000 1.000000 0.000000 0.000000 0.841010 1.000000 0.594236 0.594236
magic 0.206256 1.000000 1.000000 1.000000 0.000000 0.000000 0.841010 1.000000 0.594236 0.594236
mat 0.561040 0.000000 0.000000 0.000000 1.000000 0.598232 0.000000 0.000000 0.000000 0.000000
sat 0.937830 0.000000 0.000000 0.000000 0.598232 1.000000 0.433532 0.000000 0.000000 0.000000
saw 0.580043 0.841010 0.841010 0.841010 0.000000 0.433532 1.000000 0.841010 0.499758 0.499758
smithers 0.206256 1.000000 1.000000 1.000000 0.000000 0.000000 0.841010 1.000000 0.594236 0.594236
trick 0.347095 0.594236 0.594236 0.594236 0.000000 0.000000 0.499758 0.594236 1.000000 1.000000
use 0.347095 0.594236 0.594236 0.594236 0.000000 0.000000 0.499758 0.594236 1.000000 1.000000

In [ ]: